library(MendelianRandomization)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(ks)
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.2 ✓ purrr 0.3.4
✓ tibble 3.0.3 ✓ stringr 1.4.0
✓ tidyr 1.1.0 ✓ forcats 0.5.0
✓ readr 1.3.1
── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(ggplot2)
library(stringr)
First let’s load our data into a dataframe. This dataframe has 5 non-null columns: 1. seq_num: Index of the DNA sequence from which the effect sizes and standard errors come. 2. X_pred_mean: Effect size for the \(Z \rightarrow X\) relationship. Determined by taking the difference between predicted transcription factor binding probability for a reference sequence and a mutated version of the reference. 3. X_pred_var: Standard error of the \(Z \rightarrow X\) effect size estimate (note: despite the `_var` suffix, this column is passed to `mr_input` as a standard error, `bxse`). 4. Y_pred_mean: Effect size for the \(Z \rightarrow Y\) relationship. Determined by taking the difference between predicted chromatin accessibility probability for a reference sequence and a mutated version of the reference. 5. Y_pred_var: Standard error of the \(Z \rightarrow Y\) effect size estimate (likewise passed as a standard error, `byse`).
# Input/output locations; both currently point at the shared dat/ directory.
data_dir <- "../../dat/"
results_dir <- "../../dat/"
# Exposure and outcome transcription factors come from the notebook params.
exposure_tf <- params$exposure_tf
outcome_tf <- params$outcome_tf
# Effect-size table is named "<exposure>_<outcome>_effect_sizes.csv".
results_fname <- str_c(exposure_tf, outcome_tf, "effect_sizes.csv", sep = "_")
results_fpath <- file.path(results_dir, results_fname)
seq_predictions <- read.csv(results_fpath)
# Elementwise NaN test for a data frame: applies is.nan() to every column
# and binds the results back into a logical matrix with matching column
# names (base is.nan() has no data.frame method).
is.nan.data.frame <- function(x) {
  nan_by_col <- lapply(x, is.nan)
  do.call(cbind, nan_by_col)
}
# Drop rows with missing predictions before running per-sequence MR.
# (Use <- for assignment, per R convention.)
seq_predictions <- seq_predictions %>% drop_na()
# Sequences are indexed 1..n_seqs in the seq_num column; extract the column
# as a vector rather than taking max() of a one-column data.frame.
n_seqs <- max(seq_predictions$seq_num)
print(str_c("Running on ", n_seqs, " sequences' predictions for exposure ", exposure_tf, " and outcome ", outcome_tf))
[1] "Running on 60 sequences' predictions for exposure Nanog and outcome Sox2"
# Show the cleaned per-sequence effect-size table.
seq_predictions
# Per-sequence MR results: one row of MR-Egger summary statistics per
# sequence, plus the per-sequence MR scatter plots.
ivw_vals <- list()
egger_vals <- list()
egger_result <- matrix(ncol = 8, nrow = n_seqs)
# Preallocate a list for the plots. The original `plots <- c(plots, mr_plot(...))`
# would splice each plot object's internal components into a flat vector
# instead of storing intact plot objects, and grows the vector in a loop.
plots <- vector("list", n_seqs)
for (seq_i in seq_len(n_seqs)) {
  seq_i_predictions <- subset(seq_predictions, seq_num == seq_i)
  # Instrument-level effect sizes (means) and standard errors for this
  # sequence; *_var columns are passed to mr_input as standard errors
  # (bxse/byse) — see the data description above.
  bxt <- unlist(seq_i_predictions["X_pred_mean"])
  bxset <- unlist(seq_i_predictions["X_pred_var"])
  byt <- unlist(seq_i_predictions["Y_pred_mean"])
  byset <- unlist(seq_i_predictions["Y_pred_var"])
  MRInputObject <- mr_input(
    bx = bxt,
    bxse = bxset,
    by = byt,
    byse = byset
  )
  EggerObject <- mr_egger(MRInputObject)
  plots[[seq_i]] <- mr_plot(MRInputObject)
  # Columns: estimate, CI bounds, I^2, pleiotropy p, SE, intercept, p-value
  # (named below via colnames(egger_result)).
  egger_result[seq_i, ] <- c(
    EggerObject$Estimate,
    EggerObject$CILower.Est,
    EggerObject$CIUpper.Est,
    EggerObject$I.sq,
    EggerObject$Pleio.pval,
    EggerObject$StdError.Est,
    EggerObject$Intercept,
    EggerObject$Pvalue.Est
  )
}
# Label the MR-Egger summary columns filled in by the loop above:
#   ce   = causal effect estimate      cil/ciu = CI lower/upper bounds
#   i.sq = I^2 statistic               pleio   = pleiotropy p-value
#   std  = SE of the estimate          int     = Egger intercept
#   pval = p-value of the causal effect estimate
egger_cols <- c("ce", "cil", "ciu", "i.sq", "pleio", "std", "int", "pval")
colnames(egger_result) <- egger_cols
egger_result <- as_tibble(egger_result)
# egger_result$int
# egger_result$pleio
egger_result$pval
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
egger_result$cil
[1] 0.6624111 0.4946017 0.2596899 0.3392746 0.2643942 0.3214277 0.3202355 0.4878342 0.3956594 0.4714063 0.4591280 0.4529287 0.1941315 0.3179280 0.4748364 0.2723454 0.3476054 0.3609901 0.5434091 0.3501268 0.3697978 0.1090658 0.3051474 0.3916034 0.3526319
[26] 0.3088198 0.5313684 0.2309187 0.6876855 0.5278499 0.2404396 0.3849298 0.4827181 0.5442420 0.6083380 0.5331485 0.3213684 0.3346143 0.3504754 0.4524046 0.3085015 0.5211029 0.5249166 0.3742332 0.4465752 0.4628427 0.5612626 0.4826918 0.5524170 0.5218692
[51] 0.5326102 0.4124616 0.3531925 0.4569031 0.3447336 0.2388891 0.5691459 0.4368713 0.4230229 0.5664108
egger_result$ciu
[1] 0.6925329 0.5313000 0.2936519 0.3674658 0.2868699 0.3560291 0.3481173 0.5225789 0.4305670 0.5033834 0.4898804 0.4843012 0.2326947 0.3654092 0.5080045 0.3106776 0.3828946 0.4023319 0.5752519 0.3835175 0.4043459 0.1375472 0.3386546 0.4217517 0.3915041
[26] 0.3429232 0.5702170 0.2619052 0.7127931 0.5620440 0.2704281 0.4131669 0.5121622 0.5792549 0.6412172 0.5644271 0.3477196 0.3631897 0.3827492 0.4843635 0.3397468 0.5549002 0.5585180 0.4082484 0.4824702 0.4942072 0.5997734 0.5204230 0.5942819 0.5529227
[51] 0.5619741 0.4438768 0.3866925 0.4862023 0.3754247 0.2682767 0.6059248 0.4702945 0.4504525 0.5946959
Now we have the causal effect estimates, confidence intervals, and (for Egger) pleiotropy p-values.
# Print the full per-sequence MR-Egger results tibble.
egger_result
Let’s summarize the Egger causal effect estimates as a smoothed density (IVW estimates are not computed in this run). This will give us an idea of how heterogeneous the alleged causal relationships are at different regions of the genome.
# Smoothed density of the per-sequence causal effect estimates.
# NOTE(review): `d` is not used below — kept in case later code relies on it.
d <- density(egger_result$ce)
ce_smoothed <- ks::kde(
  x = egger_result$ce,
  binned = TRUE,
  compute.cont = TRUE,
  # Scalars; the c() wrappers in the original were redundant.
  xmin = min(egger_result$ce) - 1,
  xmax = max(egger_result$ce) + 1,
  bgridsize = 200
)
if (params$save_plots) {
  # NOTE(review): the original used undefined `cell_type` / `tf`; the
  # params-derived exposure/outcome names (as in results_fname) are used
  # instead — confirm this matches the intended naming scheme.
  output_fname <- str_c(exposure_tf, outcome_tf, "ces_kde.png", sep = "_")
  output_fpath <- file.path(params$output_dir, output_fname)
  png(output_fpath, width = 512, height = 320)
  par(mar = c(5, 6, 5, 1) + .1)
}
plot(
  ce_smoothed,
  xlab = "Causal Effect",
  # main = str_c("Causal Effect Estimates (", exposure_tf, ")"),
  cex.lab = 2,
  cex.main = 2,
  ylab = "Density"
)
if (params$save_plots) {
  dev.off()
}
# Pick the sequence whose Egger causal-effect estimate is the (upper) median
# and re-run MR-Egger on it, to plot one representative fit.
sorted_idxs <- order(egger_result$ce)
# ceiling((n + 1) / 2) is the upper-median position. It matches the original
# n / 2 + 1 for even n, and for odd n avoids relying on R silently truncating
# the fractional index n / 2 + 1.
median_idx <- sorted_idxs[ceiling((length(sorted_idxs) + 1) / 2)]
# Row index of egger_result corresponds to seq_num (rows were filled by seq).
seq_median_preds <- subset(seq_predictions, seq_num == median_idx)
bxt <- unlist(seq_median_preds["X_pred_mean"])
bxset <- unlist(seq_median_preds["X_pred_var"])
byt <- unlist(seq_median_preds["Y_pred_mean"])
byset <- unlist(seq_median_preds["Y_pred_var"])
MRInputObject <- mr_input(
  bx = bxt,
  bxse = bxset,
  by = byt,
  byse = byset
)
MedianEggerObj <- mr_egger(MRInputObject)
# Instrument effect scatter with the fitted Egger regression line.
mr_plot(MRInputObject, line = "egger")
# Per-sequence scatter of outcome vs. exposure standard errors, 2x2 per page.
if (params$save_plots) {
  # NOTE(review): the original reused `output_fname` from the KDE figure,
  # silently overwriting that file; these scatters get their own filename.
  stderr_fname <- str_c(exposure_tf, outcome_tf, "stderr_scatter.png", sep = "_")
  png(file.path(params$output_dir, stderr_fname), width = 512, height = 320)
}
par(mfrow = c(2, 2))
for (seq_i in seq_len(n_seqs)) {
  seq_i_predictions <- subset(seq_predictions, seq_num == seq_i)
  bxset <- unlist(seq_i_predictions["X_pred_var"])
  byset <- unlist(seq_i_predictions["Y_pred_var"])
  plot(bxset, byset, main = str_c("CA vs. TF std error"))
}
if (params$save_plots) {
  dev.off()
}
# Mode of the Gaussian kernel density estimate of x: the grid point at which
# the estimated density peaks. Always returns a single numeric value.
dmode <- function(x) {
  den <- density(x, kernel = "gaussian")
  # which.max avoids the original's exact floating-point equality test
  # (den$y == max(den$y)), which could in principle return multiple points.
  den$x[which.max(den$y)]
}
# Compare the density mode to the median of the causal effect estimates,
# then count negative estimates out of the total. sum(ce < 0) equals the
# original length(ce[ce < 0]) here since ce has no NAs (rows were drop_na'd).
dmode(egger_result$ce)
median(egger_result$ce)
sum(egger_result$ce < 0)
length(egger_result$ce)
Let’s also look at the distribution of pleiotropy p-values for different sequences. If they’re concentrated around .5, this means that the algorithm believes there to be very little pleiotropy.
# Histogram of pleiotropy p-values across sequences (bin width 0.04).
# qplot() is deprecated as of ggplot2 3.4; the explicit ggplot() call below
# draws the identical histogram.
ggplot(egger_result, aes(x = pleio)) +
  geom_histogram(binwidth = .04)
# Standard error of the causal estimate vs. the estimate itself, per sequence.
p <- ggplot2::ggplot(egger_result, aes(x = ce, y = std)) +
  geom_point()
p
# Scatter of instrument effect sizes for the median-effect sequence
# (bxt / byt still hold the vectors from the median-sequence chunk above).
if (params$save_plots) {
  # NOTE(review): the original used undefined `cell_type` / `tf`; use the
  # params-derived exposure/outcome names, consistent with results_fname.
  output_fname <- str_c(exposure_tf, outcome_tf, "ex_eff_sizes.png", sep = "_")
  output_fpath <- file.path(params$output_dir, output_fname)
  png(output_fpath, width = 512, height = 320)
  par(mar = c(5, 6, 5, 1) + .1)
}
plot(bxt, byt, xlab = str_c(exposure_tf, " Effect Sizes"), ylab = str_c(outcome_tf, " Effect Sizes"))
if (params$save_plots) {
  dev.off()
}